In [150]:
%matplotlib inline
import csv

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm
from pandas import DataFrame, Series
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)
Load Data into DataFrame
In [62]:
# Tab-delimited cluster tables. csv.reader is lazy, so these two file handles
# have to stay open until the readers have been iterated.
ManClus=csv.reader(open('MMETSP_140513_Cluster.tab'),delimiter='\t') #Manual clusters
AllClus=csv.reader(open('MMETSP_HigherOrder.tab'),delimiter='\t')

# Write Type 42 (TrueType) fonts into saved PDF figures.
mpl.rcParams['pdf.fonttype'] = 42

Columns=["S1", "S2", "S3", "S4", 'S5', 'A', 'B', 'C', 'D', 'E']

# read_table consumes the whole file immediately, so the handle can be closed
# as soon as the DataFrame is built instead of being left open.
with open('SummedSpecies.tab') as ClusCount:
    data=pd.read_table(ClusCount, names=Columns, index_col=0)
Create two dictionaries, one keyed by phylum and one keyed by genus, that are used to sort the species counts into two new pandas DataFrames
In [149]:
# GenusHash:  genus  -> list of species identifiers
# PhylumHash: phylum -> [list of species identifiers, list of genera]
GenusHash = {}
PhylumHash = {}
for S in MC_hash:
    ShortName = MC_hash[S]
    key = ShortName[0]
    # Field 3 of the MMETSP_Hash record is used as the genus, field 0 as the phylum.
    G = MMETSP_Hash[key][3]
    P = MMETSP_Hash[key][0]
    GenusHash.setdefault(G, []).append(S)
    # setdefault hands back the stored pair, so the appends update PhylumHash in place.
    phylum_species, phylum_genera = PhylumHash.setdefault(P, [[], []])
    phylum_species.append(S)
    phylum_genera.append(G)

# Leaves `key` / `S_List` bound to the last phylum visited and its species list.
for key in PhylumHash:
    S_List = PhylumHash[key][0]
In [169]:
def _sum_counts_by_group(counts, groups, index):
    """Add up the per-species columns of `counts` for each group.

    Parameters
    ----------
    counts : DataFrame
        One column per species.
    groups : iterable of (group_name, species_list) pairs
        Species whose column is absent from `counts` are skipped.
    index : list
        Row labels for the returned frame.

    Returns
    -------
    DataFrame
        One column per group that has at least one species present in
        `counts`; groups with no matching species get no column.
    """
    # Built once here rather than once per species inside the loop.
    known_species = set(counts.columns.values)
    grouped = pd.DataFrame(index=index)
    for group_name, species_list in groups:
        for species in species_list:
            if species not in known_species:
                continue
            if group_name in grouped:
                # Plain `+` (not .sum) so that NaN handling matches a running total.
                grouped[group_name] = grouped[group_name] + counts[species]
            else:
                grouped[group_name] = counts[species]
    return grouped


Species_Counts = data.T

# Phylum-level totals; PhylumHash values are [species_list, genus_list].
phylum_species = [(phylum, members[0]) for phylum, members in PhylumHash.items()]
Pdf = _sum_counts_by_group(Species_Counts, phylum_species, Columns)
Pdf['Unaligned'] = Species_Counts['Unaligned']

# Genus-level totals; GenusHash values are species lists.
Gdf = _sum_counts_by_group(Species_Counts, GenusHash.items(), Columns)
Gdf['Unaligned'] = Species_Counts['Unaligned']
In [146]:
# Append a 'Mean' row holding each column's mean over the existing rows.
Gdf.loc['Mean']=Gdf.mean(axis=0)
# DataFrame.sort() has been removed from pandas; sort_values() is its replacement.
# Displays the transposed table (one row per genus) ordered by ascending mean;
# Gdf itself is not reordered.
Gdf.T.sort_values('Mean', ascending=True)
Out[146]:
In [247]:
# Build the table that feeds the stacked plot: start from the phylum totals and
# split three genera out into columns of their own.
Fdf = Pdf.copy()
for genus in ('Skeletonema', 'Thalassiosira', 'Prorocentrum'):
    Fdf[genus] = Gdf[genus]

# Remove the split-out genera from their parent phylum so nothing is counted twice.
Fdf['Bacillariophyta'] = Fdf['Bacillariophyta'].sub(Fdf['Skeletonema']).sub(Fdf['Thalassiosira'])
Fdf['Dinophyta'] = Fdf['Dinophyta'].sub(Gdf['Prorocentrum'])
Fdf.loc['Mean'] = Fdf.mean()

Col_order = ['Unaligned', 'Bacillariophyta', 'Skeletonema', 'Thalassiosira', 'Dinophyta',
             'Prorocentrum', 'Ochrophyta', 'Chlorophyta', 'Ciliophora']
Cols = list(set(Fdf.columns.values) - set(Col_order))

# Reorder and cluster everything not named above into an "Other" category.
other_total = 0
for minor in Cols:
    other_total = other_total + Fdf[minor]
Fdf = Fdf.drop(Cols, axis=1)
Fdf['Other'] = other_total

# 'Other' goes first, followed by the named categories in reverse order.
Col_order = ['Other'] + Col_order[::-1]
Fdf = Fdf[Col_order].drop('Mean')
Fdf['sum'] = Fdf.sum(axis=1)

# Calculate percentage: each category as a fraction of its row total.
Fdf_percentage = Fdf[Col_order].div(Fdf['sum'], axis=0)
In [259]:
#Plot Stack plot
# Rows of Fdf_plot are the stacked layers (categories); columns are the five S samples.
Fdf_plot=Fdf_percentage.T[['S1','S2','S3','S4','S5']]

# One colour per layer, sampled from the 'hot' colormap. stackplot expects a
# `colors` sequence; a lone `color=` kwarg is forwarded unchanged to every layer,
# which paints all of them the same. endpoint=False stops short of the white
# end of the map so the last layer stays visible on a white background.
c=[tuple(rgba) for rgba in cm.hot(np.linspace(0, 1, len(Fdf_plot), endpoint=False))]

fig, ax = plt.subplots()
ax.stackplot(np.arange(5), Fdf_plot, colors=c)
ax.margins(0, 0)
fig.savefig('StackPlot.pdf')
In [246]:
# Display the category list in reverse; slicing builds a new list, so Col_order is unchanged.
Col_order[::-1]
Out[246]:
In [ ]:
load